Part 2.1: Feature Engineering and Visualizations¶
This phase focuses on enhancing the dataset by creating meaningful features that capture temporal patterns, store-specific behaviors, and customer and sales patterns. It also includes visual exploration to uncover trends, seasonality, and anomalies—laying the groundwork for robust forecasting models.
1. Setup & Import Libraries¶
InΒ [1]:
import time
InΒ [2]:
# Step 1: announce that the setup/import stage has begun.
step_message = "Step 1: Setup and Import Libraries started..."
print(step_message)
time.sleep(1)  # Simulate processing time
Step 1: Setup and Import Libraries started...
InΒ [3]:
# Data Manipulation & Processing
import math
import numpy as np
import pandas as pd
from pathlib import Path
import scipy.stats as stats
from datetime import datetime
from sklearn.preprocessing import *
# Data Visualization
import seaborn as sbn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pandas.plotting import scatter_matrix
# Render Plotly figures through both the notebook renderer and the Plotly
# MIME type so they work in Jupyter and in HTML export.
pio.renderers.default = "notebook+plotly_mimetype"
# Seaborn/Matplotlib defaults: 14x6 figures, the 'seaborn-v0_8' style sheet,
# and the "husl" colour palette.
# NOTE(review): plt.style.use() runs after sbn.set(); if that style sheet
# defines figure.figsize it would override the (14, 6) set here — confirm
# the call order is intended.
sbn.set(rc={'figure.figsize':(14,6)})
plt.style.use('seaborn-v0_8')
sbn.set_palette("husl")
# Set display options: show every column, allow 1000-character wide output,
# and format floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.2f}'.format)
# Warnings: install blanket "ignore" filters so warnings are not displayed.
# NOTE(review): both calls below add an unconditional ignore filter; one of
# them looks redundant — confirm before removing.
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')
InΒ [4]:
# Print the notebook banner, framed by two 60-character rules, followed by
# the import confirmation and the current timestamp.
banner_rule = "=" * 60
for banner_line in (
    banner_rule,
    "Rossman Store Sales Time Series Analysis - Part 2",
    banner_rule,
    "All libraries imported successfully!",
):
    print(banner_line)
analysis_stamp = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
print("Analysis Date:", analysis_stamp)
============================================================ Rossman Store Sales Time Series Analysis - Part 2 ============================================================ All libraries imported successfully! Analysis Date: 2025-08-13 21:14:32
InΒ [5]:
# Confirm that the setup stage finished.
# The leading status glyph is restored to the check-mark emoji: its UTF-8 bytes
# had been mis-decoded, which split the string literal across two lines and
# made the statement a syntax error. "Liraries" is corrected to "Libraries".
print("✅ Setup and Import Libraries completed.\n")
β Setup and Import Liraries completed.
InΒ [6]:
# Start analysis: remember when Part 2 began and print that moment in bold.
# NOTE(review): the leading glyphs in both messages look like mis-decoded
# emojis — confirm the intended characters against the original notebook.
bold_start, bold_end = '\033[1m', '\033[0m'  # ANSI bold on / reset
data_viz_begin = pd.Timestamp.now()
begin_stamp = data_viz_begin.strftime('%Y-%m-%d %H:%M:%S')
print("π Part 2 Started ...")
print(f"π’ Begin Date: {bold_start}{begin_stamp}{bold_end}\n")
π Part 2 Started ...
π’ Begin Date: 2025-08-13 21:14:32
Restore the file¶
InΒ [7]:
# IPython %store magic: -r reloads the previously stored variable df_viz_feat
# into this session (presumably saved by an earlier notebook part — verify).
%store -r df_viz_feat
View or Display Dataset¶
InΒ [8]:
# Show the first rows of the restored frame under a short heading.
preview_rows = df_viz_feat.head()
print("\nTrain Data Preview:")
print("\n", preview_rows)
Train Data Preview:
store dayofweek date sales customers open promo stateholiday schoolholiday day week month quarter year isweekend isholiday isschoolDay
982643 1115 2 2013-01-01 0 0 0 No Promo Public 1 Tue 1 Jan 1 2013 False True False
982640 1112 2 2013-01-01 0 0 0 No Promo Public 1 Tue 1 Jan 1 2013 False True False
982639 1111 2 2013-01-01 0 0 0 No Promo Public 1 Tue 1 Jan 1 2013 False True False
982638 1110 2 2013-01-01 0 0 0 No Promo Public 1 Tue 1 Jan 1 2013 False True False
982637 1109 2 2013-01-01 0 0 0 No Promo Public 1 Tue 1 Jan 1 2013 False True False
2. Data Visualization¶
InΒ [9]:
# Step 2: Data Visualization — announce that the plotting stage has begun.
print("Step 2: Data Visualization started...")
time.sleep(1) # Simulate processing time
Step 2: Data Visualization started...
Percentage Distribution per Open Status¶
InΒ [10]:
# Frequency of each store-status value; value_counts() lists the most
# frequent value first.
value_counts = df_viz_feat['open'].value_counts()

# Translate status codes to readable names: 1 -> "Open", anything else -> "Closed".
labels = []
for val in value_counts.index:
    labels.append("Open" if val == 1 else "Closed")
values = value_counts.values.tolist()

# Pull only the first (largest) slice away from the centre.
pull = [0.1 if position == 0 else 0 for position, _ in enumerate(labels)]

# Build the pie trace first, then wrap it in a figure.
status_trace = go.Pie(
    labels=labels,
    values=values,
    pull=pull,
    textinfo='percent+label',
    hoverinfo='label+value+percent',
)
fig = go.Figure(data=[status_trace])

# Left-aligned title, legend on, fixed canvas size.
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
status_layout = {
    'title_text': 'π Store Status Distribution: Open vs Closed',
    'title_x': 0.0,
    'showlegend': True,
    'width': 1200,
    'height': 450,
}
fig.update_layout(**status_layout)

fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="open_vs_closed_pie")
Percentage Distribution with Respect to Promo¶
InΒ [11]:
# Class frequencies of the 'promo' column; labels and values are taken from
# the same value_counts() result so they stay in matching order.
counts = df_viz_feat['promo'].value_counts()
labels = counts.index.tolist()
values = counts.values.tolist()

# Pull out the first (most frequent) class; leave the rest in place.
pull = [0.1 if i == 0 else 0 for i, _ in enumerate(labels)]

# Create the pie chart
fig = go.Figure(
    data=[go.Pie(
        labels=labels,
        values=values,
        pull=pull,
        textinfo='percent+label',
        insidetextorientation='radial',
    )]
)

# Layout: the title is meant to be centred. The previous value of 1.0 pinned
# it to the right edge even though it was commented as "Centered title";
# 0.5 matches the school-holiday chart.
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
fig.update_layout(
    title_text='π % Distribution per Promo',
    title_x=0.5,  # Centered title
    title_font_size=20,  # Optional: adjust font size
    margin=dict(t=60, r=60, b=60, l=60),  # Room above the plot for the title
    width=1200,
    height=450,
)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="promo_pie")
Percentage Distribution per Holiday Type¶
InΒ [12]:
# Class frequencies of the 'stateholiday' column; labels and values are taken
# from the same value_counts() result so they stay in matching order.
counts = df_viz_feat['stateholiday'].value_counts()
labels = counts.index.tolist()
values = counts.values.tolist()

# Pull out the first (most frequent) class; leave the rest in place.
pull = [0.1 if i == 0 else 0 for i, _ in enumerate(labels)]

# Create the pie chart
fig = go.Figure(
    data=[go.Pie(
        labels=labels,
        values=values,
        pull=pull,
        textinfo='percent+label',
        insidetextorientation='radial',
    )]
)

# Layout: the title is meant to be centred. The previous value of 1.0 pinned
# it to the right edge even though it was commented as "Centered title";
# 0.5 matches the school-holiday chart.
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
fig.update_layout(
    title_text='π % Distribution per State Holiday',
    title_x=0.5,  # Centered title
    title_font_size=20,  # Optional: adjust font size
    margin=dict(t=60, r=60, b=60, l=60),  # Room above the plot for the title
    width=1200,
    height=450,
)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="stateholiday_pie")
Percentage Distribution per School Holiday¶
InΒ [13]:
# Count each school-holiday flag, ordered by the flag value itself so the
# positional labels below line up with the counts.
counts = df_viz_feat['schoolholiday'].value_counts().sort_index()
labels = ['No School Holiday', 'School Holiday']  # Assumes 0=no, 1=yes
values = counts.values

# Donut trace: second slice ('School Holiday') is pulled out for emphasis.
slice_style = {
    'colors': ['#4ECDC4', '#FF6B6B'],
    'line': {'color': '#FFFFFF', 'width': 2},
}
donut_trace = go.Pie(
    labels=labels,
    values=values,
    pull=[0, 0.1],
    hole=0.3,
    textinfo='label+percent',
    marker=slice_style,
)
fig = go.Figure(data=donut_trace)

# Centred title, fixed canvas size, legend visible.
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
fig.update_layout(
    title_text='π % Distribution per School Holiday',
    title_x=0.5,
    font={'size': 12, 'family': 'Arial, sans-serif'},
    margin={'t': 60, 'b': 40, 'l': 40, 'r': 100},
    width=1200,
    height=450,
    showlegend=True,
)

# Footer note with the total row count, placed just below the plot area.
total_records = len(df_viz_feat)
fig.add_annotation(
    text=f"Total Records: {total_records:,}",
    x=0.5,
    y=-0.1,
    xref="paper",
    yref="paper",
    showarrow=False,
    font={'size': 10, 'color': "gray"},
)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="schoolholiday_pie")
Customer Analysis¶
Average Customers Trend per Day¶
InΒ [14]:
# Plot the mean number of customers for each day of the week and annotate
# the busiest day.

# Define weekday order
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Calculate mean customers by day
dow_agg = df_viz_feat.groupby('day')['customers'].mean().reset_index()

# groupby() returns the days alphabetically; an ordered categorical restores
# Mon..Sun order for both the sort and the x-axis.
dow_agg['day'] = pd.Categorical(dow_agg['day'], categories=weekday_order, ordered=True)
dow_agg = dow_agg.sort_values('day')

# Create simple line chart
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
fig = px.line(
    dow_agg,
    x='day',
    y='customers',
    title='π Average Customers by Day of Week',
    markers=True,
)

# Simple styling
fig.update_layout(
    title={'x': 0.5, 'xanchor': 'center'},
    xaxis_title='Day of Week',
    yaxis_title='Average Customers',
)

# Find peak for simple annotation. idxmax() returns an index label, so it is
# looked up with .loc (the frame was re-sorted, positions no longer match).
peak_idx = dow_agg['customers'].idxmax()
peak_day = dow_agg.loc[peak_idx, 'day']
peak_value = dow_agg.loc[peak_idx, 'customers']

# Simple annotation
fig.add_annotation(
    x=peak_day,
    y=peak_value,
    text=f"Peak: {peak_day} ({peak_value:.0f})",
    showarrow=True,
    arrowcolor='red',
    font=dict(color='red'),
)
fig.update_layout(margin=dict(t=60, r=40, b=40, l=40), width=1200, height=400)
fig.show(config={'displayModeBar': True, 'displaylogo': False})

# The table holds customer averages, so label it as such (it was previously
# printed under "Daily Sales Data:").
print("Daily Customers Data:")
print(dow_agg)
# export_plotly_chart(fig, name="daily_avg_customers_trend")
Daily Sales Data: day customers 1 Mon 812.93 5 Tue 761.86 6 Wed 721.20 4 Thu 695.78 0 Fri 742.53 2 Sat 658.76 3 Sun 35.58
Average Customers Trend per Month¶
InΒ [15]:
# Mean customers per calendar month, plotted as a line with the peak month
# annotated.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Aggregate, then convert 'month' to an ordered categorical so sorting
# follows the calendar rather than the alphabet.
monthly_grp = df_viz_feat.groupby('month')['customers'].mean().reset_index()
monthly_grp['month'] = pd.Categorical(
    monthly_grp['month'], categories=month_order, ordered=True
)
monthly_grp = monthly_grp.sort_values('month')

# Identify peak
peak_label = monthly_grp['customers'].idxmax()
peak_row = monthly_grp.loc[peak_label]
peak_month = peak_row['month']
peak_value = peak_row['customers']

# Create Plotly Express line plot ('linear' segments; 'spline' would smooth them)
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
axis_labels = {'customers': 'Average Customers', 'month': 'Month'}
fig = px.line(
    monthly_grp,
    x='month',
    y='customers',
    markers=True,
    title='π Average Customers Trend per Month',
    labels=axis_labels,
    line_shape='linear',
)

# Annotate peak with a red arrow, nudged 15px upward.
peak_note = {
    'x': peak_month,
    'y': peak_value,
    'text': f'Peak: {peak_month} ({peak_value:.1f})',
    'showarrow': True,
    'arrowhead': 2,
    'arrowsize': 1,
    'arrowwidth': 2,
    'arrowcolor': 'red',
    'font': {'color': 'red', 'size': 12},
    'yshift': 15,
}
fig.add_annotation(**peak_note)

fig.update_layout(margin={'t': 60, 'r': 40, 'b': 40, 'l': 40}, width=1200, height=450)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="monthly_avg_customers_trend")
Sales Analysis¶
Average Sales Trend per Day¶
InΒ [16]:
# Mean sales for each day of the week, plotted as a line with the peak day
# annotated.
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Aggregate, then impose Mon..Sun order through an ordered categorical.
dow_agg = df_viz_feat.groupby('day', as_index=False)['sales'].mean()
dow_agg['day'] = pd.Categorical(
    dow_agg['day'], categories=weekday_order, ordered=True
)
dow_agg = dow_agg.sort_values('day')

# Line chart with point markers.
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
fig = px.line(
    dow_agg,
    x='day',
    y='sales',
    title='π Average Sales by Day of Week',
    markers=True,
)

# Locate the highest average; idxmax() yields an index label, hence .loc.
peak_idx = dow_agg['sales'].idxmax()
peak_day = dow_agg.loc[peak_idx, 'day']
peak_value = dow_agg.loc[peak_idx, 'sales']

# Mark the peak in red.
fig.add_annotation(
    x=peak_day,
    y=peak_value,
    text=f"Peak: {peak_day} ({peak_value:.0f})",
    showarrow=True,
    arrowcolor='red',
    font={'color': 'red'},
)

# Centred title, axis captions, margins and canvas size in a single update.
fig.update_layout(
    title={'x': 0.5, 'xanchor': 'center'},
    xaxis_title='Day of Week',
    yaxis_title='Average Sales',
    margin={'t': 60, 'r': 40, 'b': 40, 'l': 40},
    width=1200,
    height=400,
)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="daily_avg_sales_trend")
Average Sales Trend per Month¶
InΒ [17]:
# Mean sales per calendar month, plotted as a line with the peak month
# annotated.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Aggregate, then convert 'month' to an ordered categorical so sorting
# follows the calendar rather than the alphabet.
monthly_grp = df_viz_feat.groupby('month')['sales'].mean().reset_index()
monthly_grp['month'] = pd.Categorical(
    monthly_grp['month'], categories=month_order, ordered=True
)
monthly_grp = monthly_grp.sort_values('month')

# Identify peak
peak_label = monthly_grp['sales'].idxmax()
peak_row = monthly_grp.loc[peak_label]
peak_month = peak_row['month']
peak_value = peak_row['sales']

# Create Plotly Express line plot ('linear' segments; 'spline' would smooth them)
# NOTE(review): the leading title glyph looks like a mis-decoded emoji —
# confirm the intended character against the original notebook.
axis_labels = {'sales': 'Average Sales', 'month': 'Month'}
fig = px.line(
    monthly_grp,
    x='month',
    y='sales',
    markers=True,
    title='π Average Sales Trend per Month',
    labels=axis_labels,
    line_shape='linear',
)

# Annotate peak with a red arrow, nudged 15px upward.
peak_note = {
    'x': peak_month,
    'y': peak_value,
    'text': f'Peak: {peak_month} ({peak_value:.1f})',
    'showarrow': True,
    'arrowhead': 2,
    'arrowsize': 1,
    'arrowwidth': 2,
    'arrowcolor': 'red',
    'font': {'color': 'red', 'size': 12},
    'yshift': 15,
}
fig.add_annotation(**peak_note)

fig.update_layout(margin={'t': 60, 'r': 40, 'b': 40, 'l': 40}, width=1200, height=450)
fig.show(config={'displayModeBar': True, 'displaylogo': False})
# export_plotly_chart(fig, name="monthly_avg_sales_trend")
Box Plots by Time Segment¶
InΒ [18]:
# Box plots of sales by month, weekday and year, followed by a printed
# mean/median summary per month and per weekday.

# Calendar orderings shared by the plots and the printed summaries.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
weekday_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def _calendar_order(present, preferred):
    """Return the labels in *present* arranged by *preferred*, unknown labels last."""
    known = [label for label in preferred if label in present]
    unknown = [label for label in present if label not in preferred]
    return known + unknown


# Box plot by Month
fig1 = px.box(
    df_viz_feat,
    x='month',
    y='sales',
    title='Sales Distribution by Month',
    category_orders={'month': month_order},
)
fig1.update_layout(title_x=0.5, width=1200, height=500)
fig1.show()

# Box plot by Day of Week
fig2 = px.box(
    df_viz_feat,
    x='day',
    y='sales',
    title='Sales Distribution by Day of Week',
    category_orders={'day': weekday_order},
)
fig2.update_layout(title_x=0.5, width=1200, height=500)
fig2.show()

# Box plot by Year
fig3 = px.box(
    df_viz_feat,
    x='year',
    y='sales',
    title='Sales Distribution by Year',
)
fig3.update_layout(title_x=0.5, width=1200, height=500)
# Show fig3 itself; the earlier code called fig.show(), which re-displayed the
# figure left over from the previous cell and never rendered the year plot.
fig3.show(config={'displayModeBar': True, 'displaylogo': False})

# Simple summary statistics
print("Sales Distribution Summary by Category:")
print("=" * 45)

# groupby() sorts string labels alphabetically, so each summary is re-ordered
# to calendar order before printing. The loop variable is named `row` rather
# than `stats` so it does not overwrite the `scipy.stats as stats` module alias.
print("\nBy Month:")
monthly_stats = df_viz_feat.groupby('month')['sales'].agg(['mean', 'median', 'std']).round(0)
monthly_stats = monthly_stats.loc[_calendar_order(monthly_stats.index, month_order)]
for month, row in monthly_stats.iterrows():
    print(f"{month}: Mean=€{row['mean']:,.0f}, Median=€{row['median']:,.0f}")

print("\nBy Day:")
daily_stats = df_viz_feat.groupby('day')['sales'].agg(['mean', 'median', 'std']).round(0)
daily_stats = daily_stats.loc[_calendar_order(daily_stats.index, weekday_order)]
for day, row in daily_stats.iterrows():
    print(f"{day}: Mean=€{row['mean']:,.0f}, Median=€{row['median']:,.0f}")